library("readxl")
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("ggplot2")
library("rockchalk")
## 
## Attaching package: 'rockchalk'
## The following object is masked from 'package:dplyr':
## 
##     summarize
library("Hmisc")
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:rockchalk':
## 
##     summarize
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library("psych")
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(olsrr)
## Registered S3 methods overwritten by 'car':
##   method                          from
##   influence.merMod                lme4
##   cooks.distance.influence.merMod lme4
##   dfbeta.influence.merMod         lme4
##   dfbetas.influence.merMod        lme4
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
## The following object is masked from 'package:dplyr':
## 
##     recode
library(lm.beta) 
library(userfriendlyscience)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'userfriendlyscience'
## The following object is masked from 'package:Hmisc':
## 
##     escapeRegex
## The following object is masked from 'package:lattice':
## 
##     oneway
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

Reading excel file

# Load the full dataset from the Excel workbook (first sheet by default).
# NOTE(review): readxl auto-names one blank-header column to `...10`; this is
# the column that later turns out to be entirely missing — verify it is junk.
academic <- read_excel("dataAcademic.xlsx")
## New names:
## * `` -> ...10

Size of the dataset

# Report the dataset's dimensions: 12,411 rows by 45 columns.
cat("Dimension of data", dim(academic),"\n")
## Dimension of data 12411 45
cat("Total number of rows",nrow(academic),fill=TRUE)
## Total number of rows 12411
cat("Total number of columns",ncol(academic),fill=TRUE)
## Total number of columns 45

Extracting sample data: a 10% sample is extracted from the population. Since the observations are drawn at random, the sample can be considered representative of the entire population.

# Draw a 10% simple random sample of the rows (1,242 of 12,411).
# set.seed() makes the draw reproducible across runs; the original draw was
# unseeded, so a different sample was produced on every execution.
set.seed(42)
sample_academic <- dplyr::sample_n(academic, size = ceiling(0.1 * nrow(academic)))
sample_academic
## # A tibble: 1,242 x 45
##    COD_S11 GENDER EDU_FATHER EDU_MOTHER OCC_FATHER OCC_MOTHER STRATUM SISBEN
##    <chr>   <chr>  <chr>      <chr>      <chr>      <chr>      <chr>   <chr> 
##  1 SB1120… F      Complete … Complete … Operator   Other occ… Stratu… Level…
##  2 SB1120… M      Incomplet… Incomplet… Independe… Home       Stratu… Level…
##  3 SB1120… F      Complete … Complete … Entrepren… Home       Stratu… It is…
##  4 SB1120… F      Complete … Complete … 0          Technical… Stratu… It is…
##  5 SB1120… M      Complete … Incomplet… Other occ… Home       Stratu… It is…
##  6 SB1120… M      Complete … Complete … Independe… Independe… Stratu… It is…
##  7 SB1120… F      Complete … Complete … Independe… Independe… Stratu… Level…
##  8 SB1120… F      Complete … Incomplet… Operator   Home       Stratu… Level…
##  9 SB1120… M      Incomplet… Complete … Independe… Operator   Stratu… Level…
## 10 SB1120… F      Complete … Complete … Technical… Home       Stratu… It is…
## # … with 1,232 more rows, and 37 more variables: PEOPLE_HOUSE <chr>,
## #   ...10 <lgl>, INTERNET <chr>, TV <chr>, COMPUTER <chr>, WASHING_MCH <chr>,
## #   MIC_OVEN <chr>, CAR <chr>, DVD <chr>, FRESH <chr>, PHONE <chr>,
## #   MOBILE <chr>, REVENUE <chr>, JOB <chr>, SCHOOL_NAME <chr>,
## #   SCHOOL_NAT <chr>, SCHOOL_TYPE <chr>, MAT_S11 <dbl>, CR_S11 <dbl>,
## #   CC_S11 <dbl>, BIO_S11 <dbl>, ENG_S11 <dbl>, Cod_SPro <chr>,
## #   UNIVERSITY <chr>, ACADEMIC_PROGRAM <chr>, QR_PRO <dbl>, CR_PRO <dbl>,
## #   CC_PRO <dbl>, ENG_PRO <dbl>, WC_PRO <dbl>, FEP_PRO <dbl>, G_SC <dbl>,
## #   PERCENTILE <dbl>, `2ND_DECILE` <dbl>, QUARTILE <dbl>, SEL <dbl>,
## #   SEL_IHE <dbl>

Variables of interest and their description: Mathematics, Biology, Written Communication, Global Score, Percentile,TV, REVENUE, FRESH

varInterest <- c("MAT_S11","BIO_S11","WC_PRO","G_SC","PERCENTILE","TV","REVENUE","ACADEMIC_PROGRAM","FRESH")

Assessing for missing data

# Count missing cells across the whole dataset. 12411 NAs == nrow(academic),
# i.e. exactly one cell is missing for every row — one fully empty column.
cat("Missing observations in the entire dataset",sum(is.na(academic)))
## Missing observations in the entire dataset 12411

The total number of missing points in the entire dataset is 12411, which equals the number of rows — suggesting one column is missing for every observation. On closer inspection, an entire column (the unnamed column auto-labelled `...10`) was indeed missing.

Summary of variable of interest from the sample.

# Restrict the dataset to the variables of interest and summarise them.
sdatasubset<-academic[varInterest]#subset considering the variables of interest
summary(sdatasubset) #summary of the variables of interest
##     MAT_S11          BIO_S11           WC_PRO           G_SC      
##  Min.   : 26.00   Min.   : 11.00   Min.   :  0.0   Min.   : 37.0  
##  1st Qu.: 56.00   1st Qu.: 56.00   1st Qu.: 28.0   1st Qu.:147.0  
##  Median : 64.00   Median : 64.00   Median : 56.0   Median :163.0  
##  Mean   : 64.32   Mean   : 63.95   Mean   : 53.7   Mean   :162.7  
##  3rd Qu.: 72.00   3rd Qu.: 71.00   3rd Qu.: 80.0   3rd Qu.:179.0  
##  Max.   :100.00   Max.   :100.00   Max.   :100.0   Max.   :247.0  
##    PERCENTILE          TV              REVENUE          ACADEMIC_PROGRAM  
##  Min.   :  1.00   Length:12411       Length:12411       Length:12411      
##  1st Qu.: 51.00   Class :character   Class :character   Class :character  
##  Median : 75.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 68.45                                                           
##  3rd Qu.: 90.00                                                           
##  Max.   :100.00                                                           
##     FRESH          
##  Length:12411      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

The following code checks whether any of the variables of interest have missing values in the sample.

res<-summary(VIM::aggr(sdatasubset, sortVar=TRUE))$combinations

## 
##  Variables sorted by number of missings: 
##          Variable Count
##           MAT_S11     0
##           BIO_S11     0
##            WC_PRO     0
##              G_SC     0
##        PERCENTILE     0
##                TV     0
##           REVENUE     0
##  ACADEMIC_PROGRAM     0
##             FRESH     0

Description statistics for each general type of variable

Nominal - Academic program is nominal data.

# BUG FIX: factor() applied to a one-column data frame collapses the whole
# column into a single <NA> value with one garbled level (visible in the
# original output). Extract the column as a plain vector before converting,
# so each academic programme becomes a proper factor level.
facAcademic <- factor(academic[[varInterest[8]]])
facAcademic
## ACADEMIC_PROGRAM 
##             <NA> 
## Levels: c("INDUSTRIAL ENGINEERING", "ELECTRONIC ENGINEERING", "CIVIL ENGINEERING", "MECHANICAL ENGINEERING", "ELECTRIC ENGINEERING", "ELECTRIC ENGINEERING AND TELECOMMUNICATIONS", "CHEMICAL ENGINEERING", "AERONAUTICAL ENGINEERING", "MECHATRONICS ENGINEERING", "INDUSTRIAL AUTOMATIC ENGINEERING", "TRANSPORTATION AND ROAD ENGINEERING", "TOPOGRAPHIC ENGINEERY", "INDUSTRIAL CONTROL AND AUTOMATION ENGINEERING", "CONTROL ENGINEERING", "CATASTRAL ENGINEERING AND GEODESY", "PRODUCTION ENGINEERING", "PRODUCTIVITY AND QUALITY ENGINEERING", \n"CIVIL CONSTRUCTIONS", "ELECTROMECHANICAL ENGINEERING", "AUTOMATION ENGINEERING", "TEXTILE ENGINEERING")

Frequency counts in the sample

# Frequency counts of the academic programmes. table() on a one-column data
# frame tabulates that column, with categories sorted alphabetically.
freqAcademic <- table(academic[varInterest][8])

freqAcademic # display the total count of each category
## 
##                      AERONAUTICAL ENGINEERING 
##                                            44 
##                        AUTOMATION ENGINEERING 
##                                            10 
##             CATASTRAL ENGINEERING AND GEODESY 
##                                            78 
##                          CHEMICAL ENGINEERING 
##                                          1000 
##                           CIVIL CONSTRUCTIONS 
##                                            14 
##                             CIVIL ENGINEERING 
##                                          3320 
##                           CONTROL ENGINEERING 
##                                            20 
##                          ELECTRIC ENGINEERING 
##                                           278 
##   ELECTRIC ENGINEERING AND TELECOMMUNICATIONS 
##                                            47 
##                 ELECTROMECHANICAL ENGINEERING 
##                                            34 
##                        ELECTRONIC ENGINEERING 
##                                           849 
##              INDUSTRIAL AUTOMATIC ENGINEERING 
##                                            22 
## INDUSTRIAL CONTROL AND AUTOMATION ENGINEERING 
##                                             1 
##                        INDUSTRIAL ENGINEERING 
##                                          5318 
##                        MECHANICAL ENGINEERING 
##                                          1135 
##                      MECHATRONICS ENGINEERING 
##                                            82 
##                        PRODUCTION ENGINEERING 
##                                            60 
##          PRODUCTIVITY AND QUALITY ENGINEERING 
##                                            29 
##                           TEXTILE ENGINEERING 
##                                             1 
##                         TOPOGRAPHIC ENGINEERY 
##                                            42 
##           TRANSPORTATION AND ROAD ENGINEERING 
##                                            27
round(prop.table(freqAcademic),digits=2) # proportian from the frequency table
## 
##                      AERONAUTICAL ENGINEERING 
##                                          0.00 
##                        AUTOMATION ENGINEERING 
##                                          0.00 
##             CATASTRAL ENGINEERING AND GEODESY 
##                                          0.01 
##                          CHEMICAL ENGINEERING 
##                                          0.08 
##                           CIVIL CONSTRUCTIONS 
##                                          0.00 
##                             CIVIL ENGINEERING 
##                                          0.27 
##                           CONTROL ENGINEERING 
##                                          0.00 
##                          ELECTRIC ENGINEERING 
##                                          0.02 
##   ELECTRIC ENGINEERING AND TELECOMMUNICATIONS 
##                                          0.00 
##                 ELECTROMECHANICAL ENGINEERING 
##                                          0.00 
##                        ELECTRONIC ENGINEERING 
##                                          0.07 
##              INDUSTRIAL AUTOMATIC ENGINEERING 
##                                          0.00 
## INDUSTRIAL CONTROL AND AUTOMATION ENGINEERING 
##                                          0.00 
##                        INDUSTRIAL ENGINEERING 
##                                          0.43 
##                        MECHANICAL ENGINEERING 
##                                          0.09 
##                      MECHATRONICS ENGINEERING 
##                                          0.01 
##                        PRODUCTION ENGINEERING 
##                                          0.00 
##          PRODUCTIVITY AND QUALITY ENGINEERING 
##                                          0.00 
##                           TEXTILE ENGINEERING 
##                                          0.00 
##                         TOPOGRAPHIC ENGINEERY 
##                                          0.00 
##           TRANSPORTATION AND ROAD ENGINEERING 
##                                          0.00

Ordinal : Revenue is ordinal data.

# Frequency counts of the ordinal REVENUE bands. NOTE(review): table() orders
# the bands alphabetically, not in their natural ordinal order — "less than 1
# LMMW" sorts last, after "Between 7 and less than 10 LMMW".
freqRevenue <- table(academic[varInterest][7])
freqRevenue
## 
##                               0                 10 or more LMMW 
##                             279                             718 
##  Between 1 and less than 2 LMMW  Between 2 and less than 3 LMMW 
##                            3873                            2783 
##  Between 3 and less than 5 LMMW  Between 5 and less than 7 LMMW 
##                            2239                             973 
## Between 7 and less than 10 LMMW                less than 1 LMMW 
##                             509                            1037
round(prop.table(freqRevenue),digits=2) # proportian from the frequency table
## 
##                               0                 10 or more LMMW 
##                            0.02                            0.06 
##  Between 1 and less than 2 LMMW  Between 2 and less than 3 LMMW 
##                            0.31                            0.22 
##  Between 3 and less than 5 LMMW  Between 5 and less than 7 LMMW 
##                            0.18                            0.08 
## Between 7 and less than 10 LMMW                less than 1 LMMW 
##                            0.04                            0.08

Interval - MAT_S11 is an example of interval data. The maths score is approximately normally distributed, therefore the mean, count and standard deviation are reported.

# Range and five-number summary of the maths score (interval data).
cat("Range of math's score in the sample",range(academic[varInterest][1]),"\n")
## Range of math's score in the sample 26 100
# Typo fixed in the message ("Summay" -> "Summary").
cat("Summary of math's score")
## Summary of math's score
# The stray "\n" argument was silently being passed as summary()'s `maxsum`
# parameter; it has been removed.
summary(academic[varInterest][1])
##     MAT_S11      
##  Min.   : 26.00  
##  1st Qu.: 56.00  
##  Median : 64.00  
##  Mean   : 64.32  
##  3rd Qu.: 72.00  
##  Max.   :100.00
# Descriptive statistics for the maths score. The column is converted to a
# plain numeric vector once, instead of repeating the as.numeric(unlist())
# conversion inside every call; the printed output is unchanged.
maths_scores <- as.numeric(unlist(academic['MAT_S11']))
cat("Mean of math's score in the sample", mean(maths_scores), "\n")
## Mean of math's score in the sample 64.32076
cat("Standard deviation of math's score", sd(maths_scores), "\n")
## Standard deviation of math's score 11.87365
cat("Minimum of math's score", min(maths_scores), "\n")
## Minimum of math's score 26
cat("Maximum of math's score", max(maths_scores), "\n")
## Maximum of math's score 100
# Three alternative descriptive summaries of the maths score.
# psych::describe adds skew, kurtosis, trimmed mean and standard error.
psych::describe(as.numeric(unlist(academic[varInterest][1])))
##    vars     n  mean    sd median trimmed   mad min max range skew kurtosis   se
## X1    1 12411 64.32 11.87     64   63.84 11.86  26 100    74  0.4     0.13 0.11
# Hmisc::describe adds quantiles, distinct-value counts and Gini mean difference.
Hmisc::describe(academic[varInterest][1])
## academic[varInterest][1] 
## 
##  1  Variables      12411  Observations
## --------------------------------------------------------------------------------
## MAT_S11 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    12411        0       69    0.999    64.32    13.33       47       50 
##      .25      .50      .75      .90      .95 
##       56       64       72       81       86 
## 
## lowest :  26  31  32  35  36, highest:  96  97  98  99 100
## --------------------------------------------------------------------------------
# pastecs::stat.desc with basic=F reports median, mean, SE, 95% CI, variance,
# standard deviation and the coefficient of variation.
pastecs::stat.desc(academic[varInterest][1], basic=F)
##                  MAT_S11
## median        64.0000000
## mean          64.3207638
## SE.mean        0.1065813
## CI.mean.0.95   0.2089158
## var          140.9835648
## std.dev       11.8736500
## coef.var       0.1846006

Visualization

Nominal data - Academic program

unique(academic[varInterest][8])
## # A tibble: 21 x 1
##    ACADEMIC_PROGRAM                           
##    <chr>                                      
##  1 INDUSTRIAL ENGINEERING                     
##  2 ELECTRONIC ENGINEERING                     
##  3 CIVIL ENGINEERING                          
##  4 MECHANICAL ENGINEERING                     
##  5 ELECTRIC ENGINEERING                       
##  6 ELECTRIC ENGINEERING AND TELECOMMUNICATIONS
##  7 CHEMICAL ENGINEERING                       
##  8 AERONAUTICAL ENGINEERING                   
##  9 MECHATRONICS ENGINEERING                   
## 10 INDUSTRIAL AUTOMATIC ENGINEERING           
## # … with 11 more rows
# Bar chart of programme frequencies. table() sorts categories alphabetically,
# so the 21 abbreviations in names.arg must follow that alphabetical order.
x <- table(academic[varInterest][8])

barplot(x,
main = "Student academic program",
xlab = "Academic program",
col = rainbow(length(x)), names.arg=(c("AE","Auto","CEG","ChE","CC","CE","CoE","EE","EET","ElE","EltE","IAE","ICAE","IE","ME","MecE","PE","PQE","TE","ToE","TRE")) )

piepercent<- round(100*x/sum(x), 1)

# Message fixed: this section reports the academic-programme shares, not
# rural/urban residence (the original message was copied from elsewhere).
cat("Percentage of students in each academic program: \n")
## Percentage of students in each academic program:
print(piepercent)
## 
##                      AERONAUTICAL ENGINEERING 
##                                           0.4 
##                        AUTOMATION ENGINEERING 
##                                           0.1 
##             CATASTRAL ENGINEERING AND GEODESY 
##                                           0.6 
##                          CHEMICAL ENGINEERING 
##                                           8.1 
##                           CIVIL CONSTRUCTIONS 
##                                           0.1 
##                             CIVIL ENGINEERING 
##                                          26.8 
##                           CONTROL ENGINEERING 
##                                           0.2 
##                          ELECTRIC ENGINEERING 
##                                           2.2 
##   ELECTRIC ENGINEERING AND TELECOMMUNICATIONS 
##                                           0.4 
##                 ELECTROMECHANICAL ENGINEERING 
##                                           0.3 
##                        ELECTRONIC ENGINEERING 
##                                           6.8 
##              INDUSTRIAL AUTOMATIC ENGINEERING 
##                                           0.2 
## INDUSTRIAL CONTROL AND AUTOMATION ENGINEERING 
##                                           0.0 
##                        INDUSTRIAL ENGINEERING 
##                                          42.8 
##                        MECHANICAL ENGINEERING 
##                                           9.1 
##                      MECHATRONICS ENGINEERING 
##                                           0.7 
##                        PRODUCTION ENGINEERING 
##                                           0.5 
##          PRODUCTIVITY AND QUALITY ENGINEERING 
##                                           0.2 
##                           TEXTILE ENGINEERING 
##                                           0.0 
##                         TOPOGRAPHIC ENGINEERY 
##                                           0.3 
##           TRANSPORTATION AND ROAD ENGINEERING 
##                                           0.2
pie(table(academic[varInterest][8]), labels = c("AE","CEG","ChE","CC","CE","CoE","EE","EET","ElE","EltE","IE","ME","MecE","PE","PQE","TE","TRE"), main = "Student academic program",col = rainbow(length(x)))

Ordinal data - Revenue

# Frequency table of the revenue bands (column [7] of the variables of
# interest); the 8 abbreviations in names.arg follow table()'s alphabetical
# category order.
x <- table(academic[varInterest][7])

# Title fixed: this chart shows the revenue distribution, not the academic
# programme (the original reused the previous chart's title).
barplot(x,
main = "Student revenue",
xlab = "Revenue",
col = rainbow(length(x)), names.arg=(c("Zero","10 or more","1 & 2","2 & 3","3 & 5","5 & 7 ","7 & 10","< 1")) )

# Percentage share of each revenue band, rounded to one decimal place.
piepercent<- round(100*x/sum(x), 1)

# Message fixed: this section reports revenue-band shares, not rural/urban
# residence (the original message was copied from elsewhere).
cat("Percentage of students in each revenue band: \n")
## Percentage of students in each revenue band:
print(piepercent)
## 
##                               0                 10 or more LMMW 
##                             2.2                             5.8 
##  Between 1 and less than 2 LMMW  Between 2 and less than 3 LMMW 
##                            31.2                            22.4 
##  Between 3 and less than 5 LMMW  Between 5 and less than 7 LMMW 
##                            18.0                             7.8 
## Between 7 and less than 10 LMMW                less than 1 LMMW 
##                             4.1                             8.4
pie(table(academic[varInterest][8]), labels = c("Zero","10 or more","1 & 2","2 & 3","3 & 5","5 & 7 ","7 & 10","< 1"), main = "Student academic program",col = rainbow(length(x)))

Interval data - MAT_S11 i.e. score obtained in mathematics.

# Histogram of maths scores with an overlaid normal density curve whose
# mean/sd are estimated from the data.
gs <- ggplot(academic, aes(x=MAT_S11))
gs <- gs + labs(x="Maths score")
# NOTE(review): the ..density.. / ..count.. notation is superseded by
# after_stat() in recent ggplot2 releases — confirm the installed version.
gs <- gs + geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..))
gs <- gs + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
gs <- gs + stat_function(fun=dnorm, color="red",args=list(mean=mean(as.numeric(unlist(academic['MAT_S11'])), na.rm=TRUE), sd=sd(as.numeric(unlist(academic['MAT_S11'])), na.rm=TRUE)))
gs

Assessing variable for normality

H0 : Mathematics score is not normally distributed. H1 : Mathematics score is normally distributed.

# QQ-plot of the maths score against the theoretical normal quantiles.
qqnorm(academic$MAT_S11)
qqline(academic$MAT_S11) # draw the reference line on the plot

# Standardised skewness and kurtosis: semTools returns the estimate and its
# standard error; estimate/SE gives a z-like score (|z| > 3.29 is commonly
# treated as problematic in large samples).
tpskew<-semTools::skew(as.numeric(unlist(academic[varInterest][1])))
tpkurt<-semTools::kurtosis(as.numeric(unlist((academic[varInterest][1]))))
tpskew[1]/tpskew[2]
## skew (g1) 
##  18.17215
tpkurt[1]/tpkurt[2]
## Excess Kur (g2) 
##        2.954064
# Absolute standardised (z) maths scores, to check for outlying values.
zmaths<- abs(scale(academic[varInterest][1]))

# Percentage of standardised scores above 1.96 and above 3.29.
ex <- FSA::perc(as.numeric(zmaths), 1.96, "gt")
ex
## [1] 4.447667
FSA::perc(as.numeric(zmaths), 3.29, "gt")
## [1] 0
pastecs::stat.desc(academic$MAT_S11, basic=F)
##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
##   64.0000000   64.3207638    0.1065813    0.2089158  140.9835648   11.8736500 
##     coef.var 
##    0.1846006

Report of normality: Maths score was assessed for normality. Visual inspection of the histogram and QQ-Plot did not reveal obvious departures from normality. The standardized score for skewness (18.17) and the standardized score for kurtosis (2.95) are not within the acceptable range using the criteria proposed by West, Finch and Curran (1996); however, these standardized tests are oversensitive in large samples. Since all standardized maths scores fall within the bounds +/- 3.29, using the guidance of Field, Miles and Field (2013) the data can be treated as approximately normally distributed (m=64.32, sd=11.87, n=12411).

Correlation

Correlation between mathematics score and biology score is checked.

Let us consider compute normality of biology score.

H0 : Biology score is not normally distributed. H1 : Biology score is normally distributed.

# Histogram of biology scores with an overlaid normal density curve whose
# mean/sd are estimated from the data.
gs <- ggplot(academic, aes(x=BIO_S11))
gs <- gs + labs(x="Biology score")
gs <- gs + geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..))
gs <- gs + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
gs <- gs + stat_function(fun=dnorm, color="red",args=list(mean=mean(as.numeric(unlist(academic['BIO_S11'])), na.rm=TRUE), sd=sd(as.numeric(unlist(academic['BIO_S11'])), na.rm=TRUE)))
gs

# QQ-plot of the biology score against the theoretical normal quantiles.
qqnorm(academic$BIO_S11)
qqline(academic$BIO_S11) # draw the reference line on the plot

# Standardised skewness and kurtosis for the biology score (column [2]).
tpskew<-semTools::skew(as.numeric(unlist(academic[varInterest][2])))
tpkurt<-semTools::kurtosis(as.numeric(unlist((academic[varInterest][2]))))
tpskew[1]/tpskew[2]
## skew (g1) 
##   13.7976
tpkurt[1]/tpkurt[2]
## Excess Kur (g2) 
##        6.800432
# NOTE(review): the name `zmaths` is reused here for the *biology* z-scores;
# consider renaming to `zbio` to avoid confusion with the maths section.
zmaths<- abs(scale(academic[varInterest][2]))

ex <- FSA::perc(as.numeric(zmaths), 1.96, "gt")
ex
## [1] 5.559584
FSA::perc(as.numeric(zmaths), 3.29, "gt")
## [1] 0.0322295
pastecs::stat.desc(academic$BIO_S11, basic=F)
##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
##   64.0000000   63.9505278    0.1001472    0.1963041  124.4757151   11.1568685 
##     coef.var 
##    0.1744609

Report of normality: Biology score was assessed for normality. Visual inspection of the histogram and QQ-Plot did not reveal obvious departures from normality. The standardized score for skewness (13.80) and the standardized score for kurtosis (6.80) are not within the acceptable range; however, 99.97% of the standardized biology scores fall within the bounds +/- 3.29, so using the guidance of Field, Miles and Field (2013) the data can be treated as approximately normally distributed (m=63.95, sd=11.16, n=12411).

Since both the variables are normally distributed Pearson correlation is considered.

Correlation test:

H0 : There is no relationship between mathematics score and biology score. H1 : There is a relationship between mathematics score and biology score.

# Scatter plot of biology (x) vs maths (y) with a fitted regression line.
x <- academic$BIO_S11
y <- academic$MAT_S11

plot(x, y, main = "Correlation between maths and biology score",
     xlab = "Biology", ylab = "Mathematics",
     pch = 20, frame = FALSE)

abline(lm(y ~ x, data = academic), col = "blue")

# Pearson correlation between biology ([2]) and maths ([1]) scores.
cor.test(as.numeric(unlist(academic[varInterest][2])), as.numeric(unlist(academic[varInterest][1])), method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(unlist(academic[varInterest][2])) and as.numeric(unlist(academic[varInterest][1]))
## t = 131.52, df = 12409, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7556332 0.7703337
## sample estimates:
##       cor 
## 0.7630821

Report correlation The relationship between mathematics score and biology was investigated using a Pearson correlation. A strong positive correlation was found (r =0.76, n=12409, p < .05).

Assessing normality of written communication.

H0 : Written communication score is not normally distributed. H1 : Written communication score is normally distributed.

# Histogram of written-communication scores with an overlaid normal density
# curve whose mean/sd are estimated from the data.
gs <- ggplot(academic, aes(x=WC_PRO))
gs <- gs + labs(x="Written communication score")
gs <- gs + geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..))
gs <- gs + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
gs <- gs + stat_function(fun=dnorm, color="red",args=list(mean=mean(as.numeric(unlist(academic['WC_PRO'])), na.rm=TRUE), sd=sd(as.numeric(unlist(academic['WC_PRO'])), na.rm=TRUE)))
gs

# QQ-plot of the written-communication score.
qqnorm(academic$WC_PRO)
qqline(academic$WC_PRO) 

# Standardised skewness and kurtosis for written communication (column [3]).
tpskew<-semTools::skew(as.numeric(unlist(academic[varInterest][3])))
tpkurt<-semTools::kurtosis(as.numeric(unlist((academic[varInterest][3]))))
tpskew[1]/tpskew[2]
## skew (g1) 
##  -8.12618
tpkurt[1]/tpkurt[2]
## Excess Kur (g2) 
##       -27.37953
# Absolute standardised (z) written-communication scores.
zwcpro<- abs(scale(academic[varInterest][3]))

# Percentage of standardised scores above 1.96 and above 3.29.
ex <- FSA::perc(as.numeric(zwcpro), 1.96, "gt")
ex
## [1] 0
FSA::perc(as.numeric(zwcpro), 3.29, "gt")
## [1] 0
pastecs::stat.desc(academic$WC_PRO, basic=F)
##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
##   56.0000000   53.7034083    0.2693041    0.5278778  900.1040488   30.0017341 
##     coef.var 
##    0.5586561

Reporting normality: Written communication score was assessed for normality. Visual inspection of the histogram and QQ-Plot did exhibit issues with skewness and kurtosis, and the standardized scores for skewness (-8.13) and kurtosis (-27.38) are not within the acceptable range. However, all standardized written-communication scores fall within the bounds +/- 3.29, so using the guidance of Field, Miles and Field (2013) the data can be treated as approximately normally distributed (m=53.7, sd=30, n=12411).

Identifying the correlation between mathematics and written communication score

H0 : There is no relationship between mathematics score and written communication score. H1 : There is a relationship between mathematics score and written communication score.

# Scatter plot of written communication (x) vs maths (y) with a fitted line.
x <- academic$WC_PRO
y <- academic$MAT_S11

plot(x, y, main = "Correlation between maths and written communication score",
     xlab = "WC_PRO", ylab = "Mathematics",
     pch = 20, frame = FALSE)

abline(lm(y ~ x, data = academic), col = "blue")

# Pearson correlation between maths ([1]) and written communication ([3]).
cor.test(as.numeric(unlist(academic[varInterest][1])), as.numeric(unlist(academic[varInterest][3])), method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(unlist(academic[varInterest][1])) and as.numeric(unlist(academic[varInterest][3]))
## t = 23.593, df = 12409, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1902951 0.2239720
## sample estimates:
##      cor 
## 0.207195

Reporting correlation The relationship between mathematics score and written communication was investigated using a Pearson correlation. A very weak positive correlation was found (r =0.2, n=12409, p < .05).

Checking normality of percentile

H0: Percentile is not normally distributed. H1: Percentile is normally distributed.

# Histogram of percentile with an overlaid normal density curve whose
# mean/sd are estimated from the data.
gs <- ggplot(academic, aes(x=PERCENTILE))
gs <- gs + labs(x="PERCENTILE")
gs <- gs + geom_histogram(binwidth=2, colour="black", aes(y=..density.., fill=..count..))
gs <- gs + scale_fill_gradient("Count", low="#DCDCDC", high="#7C7C7C")
gs <- gs + stat_function(fun=dnorm, color="red",args=list(mean=mean(as.numeric(unlist(academic['PERCENTILE'])), na.rm=TRUE), sd=sd(as.numeric(unlist(academic['PERCENTILE'])), na.rm=TRUE)))
gs

# QQ-plot of the percentile score.
qqnorm(academic$PERCENTILE)
qqline(academic$PERCENTILE) # draw the reference line on the plot

# Standardised skewness and kurtosis for the percentile (column [5]).
tpskew<-semTools::skew(as.numeric(unlist(academic[varInterest][5])))
tpkurt<-semTools::kurtosis(as.numeric(unlist((academic[varInterest][5]))))
tpskew[1]/tpskew[2]
## skew (g1) 
## -34.10691
tpkurt[1]/tpkurt[2]
## Excess Kur (g2) 
##       -10.37217
# Absolute standardised (z) percentile scores.
zperc<- abs(scale(academic[varInterest][5]))

# Percentage of standardised scores above 1.96 and above 3.29.
ex <- FSA::perc(as.numeric(zperc), 1.96, "gt")
ex
## [1] 5.269519
FSA::perc(as.numeric(zperc), 3.29, "gt")
## [1] 0
pastecs::stat.desc(academic$PERCENTILE, basic=F)
##       median         mean      SE.mean CI.mean.0.95          var      std.dev 
##   75.0000000   68.4464588    0.2321945    0.4551372  669.1301508   25.8675502 
##     coef.var 
##    0.3779239

Reporting normality: Percentile was assessed for normality. Visual inspection of the histogram and QQ-Plot did exhibit issues with skewness and kurtosis, and the standardized scores for skewness (-34.1) and kurtosis (-10.37) are not within the acceptable range. However, all standardized percentile scores fall within the bounds +/- 3.29, so using the guidance of Field, Miles and Field (2013) the data can be treated as approximately normally distributed (m=68.45, sd=25.87, n=12411).

H0 : There is no relationship between global score and percentile. H1 : There is a relationship between global score and percentile .

# Scatter plot of global score (x) vs percentile (y) with a fitted line.
x <- academic$G_SC
y <- academic$PERCENTILE

# Title fixed: this plot shows global score vs percentile; the original
# reused the maths / written-communication title from the previous section.
plot(x, y, main = "Correlation between global score and percentile",
     xlab = "G_SC", ylab = "Percentile",
     pch = 20, frame = FALSE)

abline(lm(y ~ x, data = academic), col = "blue")

# Pearson correlation between global score ([4]) and percentile ([5]).
cor.test(as.numeric(unlist(academic[varInterest][4])), as.numeric(unlist(academic[varInterest][5])), method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  as.numeric(unlist(academic[varInterest][4])) and as.numeric(unlist(academic[varInterest][5]))
## t = 400.92, df = 12409, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9622178 0.9647402
## sample estimates:
##       cor 
## 0.9635004

Reporting correlation The relationship between global score and percentile was investigated using a Pearson correlation. A very strong positive correlation was found (r =0.96, n=12409, p < .05).

Difference test involving 2 groups

Variable TV has 2 outcomes, this indicated whether the respondent has TV or not.

H0: There is no difference between the global score those who have TV and those do not have TV. H1: There is a difference between the global score those who have TV and those do not have TV.

psych::describeBy(as.numeric(unlist(academic[varInterest][4])), academic$TV, mat=TRUE)
##     item group1 vars     n     mean       sd median  trimmed     mad min max
## X11    1     No    1  1842 155.4598 22.74521    155 155.5197 23.7216  77 228
## X12    2    Yes    1 10569 163.9742 22.94365    164 164.2314 23.7216  37 247
##     range        skew    kurtosis        se
## X11   151 -0.03963129 -0.13022982 0.5299625
## X12   210 -0.10536223 -0.05546219 0.2231750
car::leveneTest(G_SC ~ TV, data=academic)
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value Pr(>F)
## group     1   0.649 0.4205
##       12409
#Pr(>F) - it is not statistically significant so we can assume homogeneity

stats::t.test(G_SC ~ TV ,var.equal=TRUE,data=academic)
## 
##  Two Sample t-test
## 
## data:  G_SC by TV
## t = -14.716, df = 12409, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -9.648411 -7.380276
## sample estimates:
##  mean in group No mean in group Yes 
##          155.4598          163.9742
#Statistically significant difference was found

# Re-run the equal-variance t-test, storing the result so the t statistic
# (res$statistic) and df (res$parameter) can be reused for effect sizes below.
res <- stats::t.test(G_SC ~ TV ,var.equal=TRUE,data=academic)

# Approximate Cohen's d from the t statistic: d = 2t / sqrt(df).
# NOTE(review): effcd is computed but never used afterwards; the reported d
# comes from effectsize::t_to_d() below.
effcd=round((2*res$statistic)/sqrt(res$parameter),2)

# Convert t to Cohen's d (with 95% CI).
effectsize::t_to_d(t = res$statistic, res$parameter)
##     d |         95% CI
## ----------------------
## -0.26 | [-0.30, -0.23]
#Eta squared calculation: eta^2 = t^2 / (t^2 + df)
effes=round((res$statistic*res$statistic)/((res$statistic*res$statistic)+(res$parameter)),3)
effes
##     t 
## 0.017

Reporting difference test

An independent-samples t-test was conducted to compare global scores for respondents who have a TV and those who do not. There was a significant difference in global score between respondents who have a TV (M = 163.97, SD = 22.94) and those who do not (M = 155.46, SD = 22.74), t(12409) = -14.72, p < .001. Cohen's d indicated a small effect size (-0.26).

Difference test for Revenue:

# Encode REVENUE as numeric codes via its factor levels.
# NOTE(review): factor() orders levels alphabetically, so the numeric codes do
# NOT follow the natural ordering of the revenue bands (e.g. "10 or more LMMW"
# sorts before "Between 1 and less than 2 LMMW" — see the describeBy output
# below). Confirm this is intended before interpreting the sign/magnitude of
# the rv coefficient in the regression models.
academic$rv <- as.numeric(factor(academic$REVENUE))

psych::describeBy(academic$G_SC, academic$REVENUE, mat=TRUE)
##     item                          group1 vars    n     mean       sd median
## X11    1                               0    1  279 169.7168 22.63700    172
## X12    2                 10 or more LMMW    1  718 183.6727 20.72211    185
## X13    3  Between 1 and less than 2 LMMW    1 3873 156.7705 22.04578    157
## X14    4  Between 2 and less than 3 LMMW    1 2783 160.9429 21.63347    162
## X15    5  Between 3 and less than 5 LMMW    1 2239 165.3363 22.07567    167
## X16    6  Between 5 and less than 7 LMMW    1  973 170.8602 21.78682    172
## X17    7 Between 7 and less than 10 LMMW    1  509 176.1768 19.96434    178
## X18    8                less than 1 LMMW    1 1037 153.3144 22.01468    153
##      trimmed     mad min max range         skew     kurtosis        se
## X11 170.7244 22.2390  37 228   191 -0.943347983  3.641999614 1.3552419
## X12 184.2622 19.2738 114 246   132 -0.280403866  0.326046072 0.7733424
## X13 156.7670 22.2390  75 237   162 -0.014011136 -0.033082696 0.3542433
## X14 161.2236 22.2390  72 247   175 -0.122777144 -0.096571942 0.4100810
## X15 165.7959 22.2390  91 238   147 -0.170770997 -0.206646615 0.4665377
## X16 171.5212 23.7216  76 239   163 -0.310366669  0.352605142 0.6984535
## X17 176.9487 19.2738 118 240   122 -0.290540077  0.008686752 0.8849039
## X18 153.2575 22.2390  81 226   145  0.003089345 -0.036601838 0.6836331
stats::bartlett.test(G_SC~ rv, data=academic)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  G_SC by rv
## Bartlett's K-squared = 14.006, df = 7, p-value = 0.05108
#p value > 0.05 , we can assume homogeneity.

userfriendlyscience::oneway(as.factor(academic$REVENUE),y=academic$G_SC,posthoc='Tukey')
## ### Oneway Anova for y=G_SC and x=REVENUE (groups: 0, 10 or more LMMW, Between 1 and less than 2 LMMW, Between 2 and less than 3 LMMW, Between 3 and less than 5 LMMW, Between 5 and less than 7 LMMW, Between 7 and less than 10 LMMW, less than 1 LMMW)
## Registered S3 methods overwritten by 'ufs':
##   method                     from               
##   grid.draw.ggProportionPlot userfriendlyscience
##   pander.associationMatrix   userfriendlyscience
##   pander.dataShape           userfriendlyscience
##   pander.descr               userfriendlyscience
##   pander.normalityAssessment userfriendlyscience
##   print.CramersV             userfriendlyscience
##   print.associationMatrix    userfriendlyscience
##   print.confIntOmegaSq       userfriendlyscience
##   print.confIntV             userfriendlyscience
##   print.dataShape            userfriendlyscience
##   print.descr                userfriendlyscience
##   print.ggProportionPlot     userfriendlyscience
##   print.meanConfInt          userfriendlyscience
##   print.multiVarFreq         userfriendlyscience
##   print.normalityAssessment  userfriendlyscience
##   print.regrInfluential      userfriendlyscience
##   print.scaleDiagnosis       userfriendlyscience
##   print.scaleStructure       userfriendlyscience
##   print.scatterMatrix        userfriendlyscience
## Omega squared: 95% CI = [.1; .12], point estimate = .11
## Eta Squared: 95% CI = [.1; .12], point estimate = .11
## 
##                                         SS    Df        MS      F     p
## Between groups (error + effect)   738464.9     7 105494.99 222.12 <.001
## Within groups (error only)      5890791.92 12403    474.95             
## 
## 
## ### Post hoc test: Tukey
## 
##                                                                diff   lwr   
## 10 or more LMMW-0                                              13.96  9.3   
## Between 1 and less than 2 LMMW-0                               -12.95 -17.04
## Between 2 and less than 3 LMMW-0                               -8.77  -12.92
## Between 3 and less than 5 LMMW-0                               -4.38  -8.57 
## Between 5 and less than 7 LMMW-0                               1.14   -3.34 
## Between 7 and less than 10 LMMW-0                              6.46   1.54  
## less than 1 LMMW-0                                             -16.4  -20.86
## Between 1 and less than 2 LMMW-10 or more LMMW                 -26.9  -29.59
## Between 2 and less than 3 LMMW-10 or more LMMW                 -22.73 -25.5 
## Between 3 and less than 5 LMMW-10 or more LMMW                 -18.34 -21.17
## Between 5 and less than 7 LMMW-10 or more LMMW                 -12.81 -16.06
## Between 7 and less than 10 LMMW-10 or more LMMW                -7.5   -11.32
## less than 1 LMMW-10 or more LMMW                               -30.36 -33.57
## Between 2 and less than 3 LMMW-Between 1 and less than 2 LMMW  4.17   2.53  
## Between 3 and less than 5 LMMW-Between 1 and less than 2 LMMW  8.57   6.81  
## Between 5 and less than 7 LMMW-Between 1 and less than 2 LMMW  14.09  11.72 
## Between 7 and less than 10 LMMW-Between 1 and less than 2 LMMW 19.41  16.29 
## less than 1 LMMW-Between 1 and less than 2 LMMW                -3.46  -5.77 
## Between 3 and less than 5 LMMW-Between 2 and less than 3 LMMW  4.39   2.52  
## Between 5 and less than 7 LMMW-Between 2 and less than 3 LMMW  9.92   7.46  
## Between 7 and less than 10 LMMW-Between 2 and less than 3 LMMW 15.23  12.05 
## less than 1 LMMW-Between 2 and less than 3 LMMW                -7.63  -10.03
## Between 5 and less than 7 LMMW-Between 3 and less than 5 LMMW  5.52   2.99  
## Between 7 and less than 10 LMMW-Between 3 and less than 5 LMMW 10.84  7.6   
## less than 1 LMMW-Between 3 and less than 5 LMMW                -12.02 -14.5 
## Between 7 and less than 10 LMMW-Between 5 and less than 7 LMMW 5.32   1.7   
## less than 1 LMMW-Between 5 and less than 7 LMMW                -17.55 -20.49
## less than 1 LMMW-Between 7 and less than 10 LMMW               -22.86 -26.44
##                                                                upr    p adj
## 10 or more LMMW-0                                              18.62  <.001
## Between 1 and less than 2 LMMW-0                               -8.85  <.001
## Between 2 and less than 3 LMMW-0                               -4.63  <.001
## Between 3 and less than 5 LMMW-0                               -0.19  .033 
## Between 5 and less than 7 LMMW-0                               5.63   .994 
## Between 7 and less than 10 LMMW-0                              11.38  .002 
## less than 1 LMMW-0                                             -11.95 <.001
## Between 1 and less than 2 LMMW-10 or more LMMW                 -24.22 <.001
## Between 2 and less than 3 LMMW-10 or more LMMW                 -19.96 <.001
## Between 3 and less than 5 LMMW-10 or more LMMW                 -15.5  <.001
## Between 5 and less than 7 LMMW-10 or more LMMW                 -9.56  <.001
## Between 7 and less than 10 LMMW-10 or more LMMW                -3.67  <.001
## less than 1 LMMW-10 or more LMMW                               -27.15 <.001
## Between 2 and less than 3 LMMW-Between 1 and less than 2 LMMW  5.81   <.001
## Between 3 and less than 5 LMMW-Between 1 and less than 2 LMMW  10.32  <.001
## Between 5 and less than 7 LMMW-Between 1 and less than 2 LMMW  16.46  <.001
## Between 7 and less than 10 LMMW-Between 1 and less than 2 LMMW 22.52  <.001
## less than 1 LMMW-Between 1 and less than 2 LMMW                -1.15  <.001
## Between 3 and less than 5 LMMW-Between 2 and less than 3 LMMW  6.27   <.001
## Between 5 and less than 7 LMMW-Between 2 and less than 3 LMMW  12.38  <.001
## Between 7 and less than 10 LMMW-Between 2 and less than 3 LMMW 18.42  <.001
## less than 1 LMMW-Between 2 and less than 3 LMMW                -5.22  <.001
## Between 5 and less than 7 LMMW-Between 3 and less than 5 LMMW  8.06   <.001
## Between 7 and less than 10 LMMW-Between 3 and less than 5 LMMW 14.08  <.001
## less than 1 LMMW-Between 3 and less than 5 LMMW                -9.54  <.001
## Between 7 and less than 10 LMMW-Between 5 and less than 7 LMMW 8.93   <.001
## less than 1 LMMW-Between 5 and less than 7 LMMW                -14.6  <.001
## less than 1 LMMW-Between 7 and less than 10 LMMW               -19.29 <.001
# Fit the one-way ANOVA with stats::aov so the F statistic, p-value and effect
# size can be extracted programmatically for reporting.
res2<-stats::aov(G_SC~ REVENUE, data = academic)


# First element of each column corresponds to the REVENUE effect row.
fstat<-summary(res2)[[1]][["F value"]][[1]]

aovpvalue<-summary(res2)[[1]][["Pr(>F)"]][[1]]

# Eta squared for the REVENUE effect.
# NOTE(review): sjstats::eta_sq() is deprecated in recent sjstats releases
# (functionality moved to effectsize::eta_squared()) — confirm package version.
aoveta<-sjstats::eta_sq(res2)[2]

A one-way between-groups analysis of variance (ANOVA) was conducted to explore the impact of revenue on global score. Participants were divided into eight groups according to their revenue (Group 1: 0 ; Group 2: 10 or more LMMW ; Group 3: Between 1 and less than 2 LMMW ; Group 4: Between 2 and less than 3 LMMW ; Group 5: Between 3 and less than 5 LMMW ; Group 6: Between 5 and less than 7 LMMW; Group 7: Between 7 and less than 10 LMMW ; Group 8: less than 1 LMMW). There was a statistically significant difference at the p < .05 level in global scores for the eight revenue groups: F(7, 12403) = 222.12, p < .001. The effect size, calculated using eta squared, was 0.11, a moderate effect. Post-hoc comparisons using the Tukey HSD test indicated that nearly all pairs of groups differed significantly from each other; the only non-significant pairwise difference was between Group 1 (0) and Group 6 (Between 5 and less than 7 LMMW), p = .994.

Build linear regression model1- Outcome variable: Global score ; Predictors: Mathematics and WC_PRO

H0: There is no relationship between global score and mathematics score, written communication score. H1: There is a relationship between global score and mathematics score, written communication score.

#Baseline model
# NOTE(review): predictors are referenced as academic$... inside the formula
# rather than via lm(G_SC ~ MAT_S11 + WC_PRO, data = academic); kept as-is so
# the coefficient labels in the stargazer output below stay unchanged.
model1=lm(academic$G_SC~academic$MAT_S11+academic$WC_PRO)
stargazer::stargazer(model1, type="text")
## 
## ================================================
##                         Dependent variable:     
##                     ----------------------------
##                                 G_SC            
## ------------------------------------------------
## MAT_S11                       1.076***          
##                               (0.011)           
##                                                 
## WC_PRO                        0.339***          
##                               (0.004)           
##                                                 
## Constant                     75.304***          
##                               (0.728)           
##                                                 
## ------------------------------------------------
## Observations                   12,411           
## R2                             0.600            
## Adjusted R2                    0.600            
## Residual Std. Error     14.615 (df = 12408)     
## F Statistic         9,314.261*** (df = 2; 12408)
## ================================================
## Note:                *p<0.1; **p<0.05; ***p<0.01

Reporting model 1: A multiple linear regression was carried out to investigate whether written communication and mathematics score predict global score. A significant regression equation was found (F(2, 12408) = 9314.26, p < .001), with an R2 of 0.60: Global score = 75.30 + 1.08 * (MAT_S11) + 0.34 * (WC_PRO). Mathematics score and written communication score are both significant predictors of global score.

The R2 value of 0.6 indicates that 60% of the variation in global score can be explained by the model containing mathematics score and written communication.

#Influential Outliers - Cook's distance
cooksd<-sort(cooks.distance(model1))

#Cook's distance plot: points above the red line (4 * mean) are flagged as
#influential. (Fixed "Obsservations" typo in the title.)
plot(cooksd, pch="*", cex=2, main="Influential Observations by Cooks distance")  
abline(h = 4*mean(cooksd, na.rm=TRUE), col="red")  
text(x=seq_along(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=TRUE),names(cooksd),""), col="red")  # add labels

#Cook's distance is used to find the influential outliers in a set of predictor variables. It helps to identify the points that negatively affect regression model. Red line in the figure corresponds to the recommended threshold value (4 * mean). There are three Cook's distance values that are relatively higher than the others, which exceed the threshold value. 

sum((cooks.distance(model1))>4*mean(cooksd))
## [1] 618
#618
#There are 618 influential observations in the dataset, i.e. about 5% of the 12,411 cases.

influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=T))])  
stem(influential)
## 
##   The decimal point is 3 digit(s) to the right of the |
## 
##    0 | 000111112222223333344455555566666678888888899
##    1 | 0000111111222333333333344444555555667888899999
##    2 | 000000001111233445566667777888899999
##    3 | 00111111222222233333344444444555567777778888899
##    4 | 001111111222333334444444455555556666777777788888999999999
##    5 | 00000011111223333333333444445566666666666667778888899999
##    6 | 00011111111222222223333444444444555555556666677777778888888888999999
##    7 | 0000000111122222234444444445567778889999999999
##    8 | 000111112223334444444555555667778889
##    9 | 000000011112222222333444444555556666666666667777778888888999
##   10 | 0000011111122334444455555666666677777777888888899
##   11 | 0111112222222222333333444444555556678888899999
##   12 | 0011111111222223333334444
head(academic[influential, ])  # influential observations.
## # A tibble: 6 x 46
##   COD_S11 GENDER EDU_FATHER EDU_MOTHER OCC_FATHER OCC_MOTHER STRATUM SISBEN
##   <chr>   <chr>  <chr>      <chr>      <chr>      <chr>      <chr>   <chr> 
## 1 SB1120… M      Complete … Complete … Other occ… Home       Stratu… Level…
## 2 SB1120… M      0          0          0          0          0       0     
## 3 SB1120… F      Complete … Complete … Technical… Technical… Stratu… Level…
## 4 SB1120… F      Postgradu… Postgradu… Technical… Executive  Stratu… It is…
## 5 SB1120… F      Complete … Complete … Auxiliary… Small ent… Stratu… Level…
## 6 SB1120… M      Postgradu… Complete … 0          Home       Stratu… It is…
## # … with 38 more variables: PEOPLE_HOUSE <chr>, ...10 <lgl>, INTERNET <chr>,
## #   TV <chr>, COMPUTER <chr>, WASHING_MCH <chr>, MIC_OVEN <chr>, CAR <chr>,
## #   DVD <chr>, FRESH <chr>, PHONE <chr>, MOBILE <chr>, REVENUE <chr>,
## #   JOB <chr>, SCHOOL_NAME <chr>, SCHOOL_NAT <chr>, SCHOOL_TYPE <chr>,
## #   MAT_S11 <dbl>, CR_S11 <dbl>, CC_S11 <dbl>, BIO_S11 <dbl>, ENG_S11 <dbl>,
## #   Cod_SPro <chr>, UNIVERSITY <chr>, ACADEMIC_PROGRAM <chr>, QR_PRO <dbl>,
## #   CR_PRO <dbl>, CC_PRO <dbl>, ENG_PRO <dbl>, WC_PRO <dbl>, FEP_PRO <dbl>,
## #   G_SC <dbl>, PERCENTILE <dbl>, `2ND_DECILE` <dbl>, QUARTILE <dbl>,
## #   SEL <dbl>, SEL_IHE <dbl>, rv <dbl>
head(academic[influential, ]$G_SC)  
## [1] 104 182 125 201 108 222
# 104 182 125 201 108 222 are influential observations of global score
head(academic[influential, ]$MAT_S11) 
## [1]  47  49  49 100  52  86
#  47  49  49 100  52  86 are influential observations of maths
head(academic[influential, ]$WC_PRO)  
## [1]   4  90  73 100  32  92
# 4  90  73 100  32  92 are influential observations of written communication

car::outlierTest(model1) 
##         rstudent unadjusted p-value Bonferroni p
## 1336  -10.671216         1.8078e-26   2.2437e-22
## 7721   -5.458853         4.8844e-08   6.0620e-04
## 10858  -5.374858         7.8030e-08   9.6843e-04
## 2021   -4.799068         1.6126e-06   2.0014e-02
## 173    -4.758351         1.9738e-06   2.4497e-02
#Observations 1336, 7721, 10858, 2021, 173 are outliers 

car::leveragePlots(model1) # leverage plots

plot(model1,1)

#Red line is horizontally drawn near 0, it indicates the presence of linear model. 

#Homocedasticity 
plot(model1, 3)

#This plot helps us to assess the homoscedasticity of the model. The points are fairly randomly distributed across the range of the X axis and the red line is roughly flat, so we can assume the model is homoscedastic.

#Create histogram and  density plot of the residuals
plot(density(resid(model1))) 

car::qqPlot(model1, main="QQ Plot") #qq plot for studentized resid

## [1] 1336 7721
#Collinearity
vifmodel<-car::vif(model1)
vifmodel
## academic$MAT_S11  academic$WC_PRO 
##         1.044855         1.044855
#tolerance
1/vifmodel
## academic$MAT_S11  academic$WC_PRO 
##        0.9570702        0.9570702

Build linear regression model2- Outcome variable: Global score ; Predictors: Mathematics, WC_PRO and Revenue

H0: There is no relationship between global score and mathematics score, written communication score, revenue. H1: There is a relationship between global score and mathematics score, written communication score, revenue.

model2=lm(academic$G_SC~academic$MAT_S11+academic$WC_PRO+academic$rv)

summary(model2)
## 
## Call:
## lm(formula = academic$G_SC ~ academic$MAT_S11 + academic$WC_PRO + 
##     academic$rv)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -155.739   -9.657    0.605    9.888   66.259 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      75.97773    0.80221  94.711   <2e-16 ***
## academic$MAT_S11  1.07564    0.01129  95.250   <2e-16 ***
## academic$WC_PRO   0.33918    0.00447  75.880   <2e-16 ***
## academic$rv      -0.15539    0.07763  -2.002   0.0453 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.61 on 12407 degrees of freedom
## Multiple R-squared:  0.6003, Adjusted R-squared:  0.6002 
## F-statistic:  6212 on 3 and 12407 DF,  p-value: < 2.2e-16
anova(model2)
## Analysis of Variance Table
## 
## Response: academic$G_SC
##                     Df  Sum Sq Mean Sq    F value  Pr(>F)    
## academic$MAT_S11     1 2748008 2748008 12868.6032 < 2e-16 ***
## academic$WC_PRO      1 1230958 1230958  5764.4332 < 2e-16 ***
## academic$rv          1     856     856     4.0073 0.04533 *  
## Residuals        12407 2649435     214                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Reporting: Multiple linear regression was carried out to investigate the relationship between mathematics score, written communication score, revenue and global score. There was a significant relationship between mathematics score and global score (p < .001), written communication and global score (p < .001), and revenue and global score (p = .045). A significant regression equation was found (F(3, 12407) = 6212, p < .001): Global score = 75.98 + 1.08 * (MAT_S11) + 0.34 * (WC_PRO) - 0.16 * (rv), with an R2 of 0.60. Mathematics score, written communication score and revenue are significant predictors of global score.

Revenue is converted as numeric factors. The R2 value of 0.6 indicates that 60% of the variation in global score can be explained by the model containing mathematics score, written communication and revenue.

#Influential Outliers - Cook's distance
cooksd<-sort(cooks.distance(model2))

#Cook's distance plot: points above the red line (4 * mean) are flagged as
#influential. (Fixed "Obsservations" typo in the title.)
plot(cooksd, pch="*", cex=2, main="Influential Observations by Cooks distance")  
abline(h = 4*mean(cooksd, na.rm=TRUE), col="red")  
text(x=seq_along(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=TRUE),names(cooksd),""), col="red")  # add labels

#Cook's distance is used to find the influential outliers in a set of predictor variables. It helps to identify the points that negatively affect regression model. Red line in the figure corresponds to the recommended threshold value (4 * mean). There are three Cook's distance values that are relatively higher than the others, which exceed the threshold value. 

sum((cooks.distance(model2))>4*mean(cooksd))
## [1] 619
#619
#There are 619 influential observations in the dataset, i.e. about 5% of the 12,411 cases.

influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=T))])  
stem(influential)
## 
##   The decimal point is 3 digit(s) to the right of the |
## 
##    0 | 000011111122222333344445556666667788888999
##    1 | 00111111122222333333333334444455555556666778888999999
##    2 | 00000011112334445555566666667778889999
##    3 | 001111111122222333333344444444455555566777777888899
##    4 | 0001111111122222333333444444455555555666677777778888899999999
##    5 | 0000001111112223333333334445555666666666666777788889999
##    6 | 00001111112222222333444444555555555566667777777788888999999999
##    7 | 00011111222223334444444445567778888999999999
##    8 | 0001111112333344444455556677778889
##    9 | 0000000111111122222233334444455666666666677777788888888899
##   10 | 0000001111112333344445566666677777777888888888999
##   11 | 001111111222222222333334444445555555667888999999
##   12 | 001111111111222333334444
head(academic[influential, ])  # influential observations.
## # A tibble: 6 x 46
##   COD_S11 GENDER EDU_FATHER EDU_MOTHER OCC_FATHER OCC_MOTHER STRATUM SISBEN
##   <chr>   <chr>  <chr>      <chr>      <chr>      <chr>      <chr>   <chr> 
## 1 SB1120… F      Postgradu… Complete … Technical… Home       Stratu… Level…
## 2 SB1120… M      Incomplet… Incomplet… Other occ… Home       Stratu… Level…
## 3 SB1120… M      Complete … Postgradu… Independe… Technical… Stratu… Level…
## 4 SB1120… M      Complete … Complete … Independe… Operator   Stratu… Level…
## 5 SB1120… F      Complete … Complete … Technical… Auxiliary… Stratu… It is…
## 6 SB1120… M      Postgradu… Postgradu… Independe… Independe… Stratu… It is…
## # … with 38 more variables: PEOPLE_HOUSE <chr>, ...10 <lgl>, INTERNET <chr>,
## #   TV <chr>, COMPUTER <chr>, WASHING_MCH <chr>, MIC_OVEN <chr>, CAR <chr>,
## #   DVD <chr>, FRESH <chr>, PHONE <chr>, MOBILE <chr>, REVENUE <chr>,
## #   JOB <chr>, SCHOOL_NAME <chr>, SCHOOL_NAT <chr>, SCHOOL_TYPE <chr>,
## #   MAT_S11 <dbl>, CR_S11 <dbl>, CC_S11 <dbl>, BIO_S11 <dbl>, ENG_S11 <dbl>,
## #   Cod_SPro <chr>, UNIVERSITY <chr>, ACADEMIC_PROGRAM <chr>, QR_PRO <dbl>,
## #   CR_PRO <dbl>, CC_PRO <dbl>, ENG_PRO <dbl>, WC_PRO <dbl>, FEP_PRO <dbl>,
## #   G_SC <dbl>, PERCENTILE <dbl>, `2ND_DECILE` <dbl>, QUARTILE <dbl>,
## #   SEL <dbl>, SEL_IHE <dbl>, rv <dbl>
head(academic[influential, ]$G_SC)  
## [1] 205 154 114 140 191 227
# 205 154 114 140 191 227 are influential observations of global score
head(academic[influential, ]$MAT_S11) 
## [1] 60 81 45 56 96 91
# 60 81 45 56 96 91 are influential observations of maths
head(academic[influential, ]$WC_PRO)  
## [1] 58 38 34 80 96 98
# 58 38 34 80 96 98 are influntial observations of written communication

car::outlierTest(model2) 
##         rstudent unadjusted p-value Bonferroni p
## 1336  -10.709375         1.2025e-26   1.4925e-22
## 7721   -5.463285         4.7642e-08   5.9129e-04
## 10858  -5.336734         9.6302e-08   1.1952e-03
## 2021   -4.813386         1.5014e-06   1.8634e-02
## 173    -4.772609         1.8393e-06   2.2828e-02
#Observations 1336, 7721, 10858, 2021 and 173 are outliers 

car::leveragePlots(model2) # leverage plots

plot(model2,1)

#Red line is horizontally drawn near 0, it indicates the presence of linear model. 

#Homocedasticity 
plot(model2, 3)

#This plot helps us to assess the homoscedasticity of the model. The points are fairly randomly distributed across the range of the X axis and the red line is roughly flat, so we can assume the model is homoscedastic.

#Create histogram and  density plot of the residuals
# BUG FIX: this section diagnoses model2, but the original code plotted
# model1's residual density and QQ plot.
plot(density(resid(model2))) 

car::qqPlot(model2, main="QQ Plot") #qq plot for studentized resid

## [1] 1336 7721
#Collinearity
vifmodel<-car::vif(model2)
vifmodel
## academic$MAT_S11  academic$WC_PRO      academic$rv 
##         1.044858         1.045145         1.000282
#tolerance
1/vifmodel
## academic$MAT_S11  academic$WC_PRO      academic$rv 
##        0.9570683        0.9568046        0.9997182

Differential effect : Adding a term TV to understand differential effect.

Build linear regression model3 - Outcome variable: Global score ; Predictors: Mathematics, WC_PRO, Revenue and TV

H0: There is no relationship between global score and mathematics score, written communication score, revenue, TV. H1: There is a relationship between global score and mathematics score, written communication score, revenue, TV.

#Model 3 adding in TV 
# Dummy-code TV: 1 = "Yes", 0 = "No", NA for any other value.
academic$tvf=ifelse(academic$TV == "Yes", 1, ifelse(academic$TV == "No", 0, NA))

# Add the TV dummy to the model2 predictors to test its differential effect.
model3=lm(academic$G_SC~academic$MAT_S11+academic$WC_PRO+academic$rv+academic$tvf)

summary(model3)
## 
## Call:
## lm(formula = academic$G_SC ~ academic$MAT_S11 + academic$WC_PRO + 
##     academic$rv + academic$tvf)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -155.804   -9.678    0.670    9.831   65.841 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      73.541502   0.845204  87.010   <2e-16 ***
## academic$MAT_S11  1.065957   0.011310  94.251   <2e-16 ***
## academic$WC_PRO   0.338026   0.004458  75.829   <2e-16 ***
## academic$rv      -0.085373   0.077780  -1.098    0.272    
## academic$tvf      3.311155   0.371888   8.904   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.57 on 12406 degrees of freedom
## Multiple R-squared:  0.6029, Adjusted R-squared:  0.6028 
## F-statistic:  4708 on 4 and 12406 DF,  p-value: < 2.2e-16
anova(model3)
## Analysis of Variance Table
## 
## Response: academic$G_SC
##                     Df  Sum Sq Mean Sq    F value  Pr(>F)    
## academic$MAT_S11     1 2748008 2748008 12949.7903 < 2e-16 ***
## academic$WC_PRO      1 1230958 1230958  5800.8006 < 2e-16 ***
## academic$rv          1     856     856     4.0325 0.04465 *  
## academic$tvf         1   16823   16823    79.2749 < 2e-16 ***
## Residuals        12406 2632613     212                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Reporting: Multiple linear regression was carried out to investigate the relationship between mathematics score, written communication score, revenue, TV and global score. There was a significant relationship between mathematics score and global score (p < .001), written communication and global score (p < .001), and TV and global score (p < .001); revenue was not a significant predictor in this model (p = .272). A significant regression equation was found (F(4, 12406) = 4708, p < .001), with an R2 of 0.60. Mathematics score, written communication score and TV are significant predictors of global score.

#Influential Outliers - Cook's distance
cooksd<-sort(cooks.distance(model3))

#Cook's distance plot: points above the red line (4 * mean) are flagged as
#influential. (Fixed "Obsservations" typo in the title.)
plot(cooksd, pch="*", cex=2, main="Influential Observations by Cooks distance")  
abline(h = 4*mean(cooksd, na.rm=TRUE), col="red")  
text(x=seq_along(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=TRUE),names(cooksd),""), col="red")  

# BUG FIX: the count below previously used cooks.distance(model2) while the
# threshold 4*mean(cooksd) came from model3 — use model3 consistently.
sum((cooks.distance(model3))>4*mean(cooksd))
## [1] 629
# Roughly 629 influential observations, i.e. about 5% of the 12,411 cases.
# (The printed value predates the model2/model3 fix — re-knit to refresh.)

influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=TRUE))])  
stem(influential)
## 
##   The decimal point is 3 digit(s) to the right of the |
## 
##    0 | 0000011111222233333444455555566666777788888889999
##    1 | 0001111112222223333333333344444444555555556666677777788888899999999
##    2 | 00000001111122233334445555566666677777788899999
##    3 | 00011111111112222222333333334444444444555555666677777788888889999
##    4 | 0000001111111122222233333444444455556666777777888888999999
##    5 | 0000011112233333445556666666667777788888999
##    6 | 00001111112222223334444444555555555666677777777778888889999999999
##    7 | 0001111222233444555677788999999999
##    8 | 0000111111223344444455555556777889
##    9 | 0000000111122222233334444455566666666667777778888888889
##   10 | 000000111223344455556666677777788888888889999
##   11 | 000111111222222222333344444445555567788899999999
##   12 | 000111111111223333444
head(academic[influential, ])  # influential observations.
## # A tibble: 6 x 47
##   COD_S11 GENDER EDU_FATHER EDU_MOTHER OCC_FATHER OCC_MOTHER STRATUM SISBEN
##   <chr>   <chr>  <chr>      <chr>      <chr>      <chr>      <chr>   <chr> 
## 1 SB1120… M      Complete … Postgradu… Executive  Other occ… Stratu… It is…
## 2 SB1120… M      Incomplet… Incomplet… Small ent… Other occ… Stratu… It is…
## 3 SB1120… M      Postgradu… Complete … Technical… Technical… Stratu… Level…
## 4 SB1120… M      Complete … Complete … Technical… Home       Stratu… It is…
## 5 SB1120… M      Incomplet… Complete … Operator   Independe… Stratu… It is…
## 6 SB1120… F      Incomplet… Incomplet… 0          0          Stratu… Esta …
## # … with 39 more variables: PEOPLE_HOUSE <chr>, ...10 <lgl>, INTERNET <chr>,
## #   TV <chr>, COMPUTER <chr>, WASHING_MCH <chr>, MIC_OVEN <chr>, CAR <chr>,
## #   DVD <chr>, FRESH <chr>, PHONE <chr>, MOBILE <chr>, REVENUE <chr>,
## #   JOB <chr>, SCHOOL_NAME <chr>, SCHOOL_NAT <chr>, SCHOOL_TYPE <chr>,
## #   MAT_S11 <dbl>, CR_S11 <dbl>, CC_S11 <dbl>, BIO_S11 <dbl>, ENG_S11 <dbl>,
## #   Cod_SPro <chr>, UNIVERSITY <chr>, ACADEMIC_PROGRAM <chr>, QR_PRO <dbl>,
## #   CR_PRO <dbl>, CC_PRO <dbl>, ENG_PRO <dbl>, WC_PRO <dbl>, FEP_PRO <dbl>,
## #   G_SC <dbl>, PERCENTILE <dbl>, `2ND_DECILE` <dbl>, QUARTILE <dbl>,
## #   SEL <dbl>, SEL_IHE <dbl>, rv <dbl>, tvf <dbl>
head(academic[influential, ]$G_SC)  
## [1] 123 138 214 175 135 188
# 123 138 214 175 135 188 are influential observations of global score
head(academic[influential, ]$MAT_S11) 
## [1] 65 67 73 64 53 52
# 65 67 73 64 53 52 are influential observations of maths
head(academic[influential, ]$WC_PRO)  
## [1]  0 87 72 10 76 99
# 0 87 72 10 76 99 are influential observations of written communication

car::outlierTest(model3) 
##         rstudent unadjusted p-value Bonferroni p
## 1336  -10.747996         7.9486e-27   9.8650e-23
## 7721   -5.512795         3.6024e-08   4.4709e-04
## 10858  -5.407799         6.4987e-08   8.0655e-04
## 2021   -4.868291         1.1395e-06   1.4143e-02
## 173    -4.814642         1.4920e-06   1.8518e-02
#Observations 1336, 7721, 10858, 2021 and 173 are outcome values that are unusual given their predictor values. 

car::leveragePlots(model3) # leverage plots

plot(model3,1)

#Red line is horizontally drawn near 0, it indicates the presence of linear model. 

#Homocedasticity 
plot(model3, 3)

#This plot helps us to assess the homoscedasticity of the model. The points are fairly randomly distributed across the range of the X axis and the red line is roughly flat with equally spread observations, so we can assume the model is homoscedastic.

#Create histogram and  density plot of the residuals
plot(density(resid(model3))) 

car::qqPlot(model3, main="QQ Plot") #qq plot for studentized resid

## [1] 1336 7721
#Collinearity
vifmodel<-car::vif(model3)
vifmodel
## academic$MAT_S11  academic$WC_PRO      academic$rv     academic$tvf 
##         1.054605         1.046026         1.010613         1.022315
#tolerance
1/vifmodel
## academic$MAT_S11  academic$WC_PRO      academic$rv     academic$tvf 
##        0.9482223        0.9559996        0.9894986        0.9781722

Interaction effect: adding a term TV * MAT_S11 to investigate whether the effect of mathematics score on global score depends on TV ownership.

Build linear regression model4 - Outcome variable: Global score; Predictors: Mathematics, WC_PRO, Revenue, TV, and the interaction term (TV * MAT_S11).

H0: There is no relationship between global score and mathematics score, written communication score, revenue, TV, or the interaction term (TV * MAT_S11). H1: There is a relationship between global score and at least one of mathematics score, written communication score, revenue, TV, or the interaction term (TV * MAT_S11).

# Interaction term: product of TV (numeric recoding in tvf -- confirm upstream coding)
# and the mathematics score.
academic$variable <- (academic$tvf) * academic$MAT_S11

# Model 4: global score ~ maths + written communication + revenue + TV + TV:maths.
# (Assignment changed from `=` to the idiomatic `<-`; formula unchanged.)
model4 <- lm(academic$G_SC ~ academic$MAT_S11 + academic$WC_PRO +
               academic$rv + academic$tvf + academic$variable)

# Coefficient table and fit statistics for the interaction model.
summary(model4)
## 
## Call:
## lm(formula = academic$G_SC ~ academic$MAT_S11 + academic$WC_PRO + 
##     academic$rv + academic$tvf + academic$variable)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -155.783   -9.688    0.669    9.830   65.832 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       73.021149   1.926271  37.908   <2e-16 ***
## academic$MAT_S11   1.074404   0.030290  35.471   <2e-16 ***
## academic$WC_PRO    0.338025   0.004458  75.826   <2e-16 ***
## academic$rv       -0.084939   0.077797  -1.092   0.2749    
## academic$tvf       3.914812   2.042181   1.917   0.0553 .  
## academic$variable -0.009759   0.032464  -0.301   0.7637    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.57 on 12405 degrees of freedom
## Multiple R-squared:  0.6029, Adjusted R-squared:  0.6027 
## F-statistic:  3767 on 5 and 12405 DF,  p-value: < 2.2e-16
# MAT_S11 and WC_PRO are strongly significant; rv (p = .27) and the interaction
# term (p = .76) are not; tvf is only marginal (p = .055).
# Sequential (Type I) ANOVA: terms are tested in the order they enter the model.
anova(model4)
## Analysis of Variance Table
## 
## Response: academic$G_SC
##                      Df  Sum Sq Mean Sq    F value  Pr(>F)    
## academic$MAT_S11      1 2748008 2748008 12948.8408 < 2e-16 ***
## academic$WC_PRO       1 1230958 1230958  5800.3753 < 2e-16 ***
## academic$rv           1     856     856     4.0322 0.04466 *  
## academic$tvf          1   16823   16823    79.2691 < 2e-16 ***
## academic$variable     1      19      19     0.0904 0.76371    
## Residuals         12405 2632594     212                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Cook's distance: identify observations with unusually high influence on model4.
cooksd <- sort(cooks.distance(model4))

plot(cooksd, pch="*", cex=2, main="Influential observation by Cooks distance")  
abline(h = 4*mean(cooksd, na.rm=TRUE), col="red")  # conventional 4*mean cutoff line
# Label points above the cutoff with their original row names.
# Fix: the original x = 1:length(cooksd)+1 parsed as (1:n)+1, shifting every
# label one position to the right; seq_along() puts labels on the points.
text(x=seq_along(cooksd), y=cooksd,
     labels=ifelse(cooksd>4*mean(cooksd, na.rm=TRUE), names(cooksd), ""),
     col="red")

sum(cooks.distance(model4)>4*mean(cooksd, na.rm=TRUE))
## [1] 610
#610 observations exceed the 4*mean cutoff, i.e. are influential (not necessarily outliers).

# Original row numbers (carried in the names of cooksd) of the influential observations.
influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=TRUE))])
# Stem-and-leaf of the influential row numbers: shows they are spread across the
# whole dataset rather than clustered in one region.
stem(influential)
## 
##   The decimal point is 3 digit(s) to the right of the |
## 
##    0 | 00000111112222233334444455556666677777888888889999
##    1 | 00011111122222333333333444444444555555566666777777788888889999999
##    2 | 00000001111111223333444555566666777778889999
##    3 | 00111111112222222233333333333444444444555555566667777788888888899999
##    4 | 00000000111111111122222222233334444455667778888889999
##    5 | 000111123333344555556666666777778888999
##    6 | 000011111222233334444445555555566777777777788888899999999
##    7 | 0000111122223344455667777888999999999
##    8 | 0000011111122234444555555666777788899
##    9 | 00000001112222222233334444445556666666677777777888888889
##   10 | 00000011223445556666677777888888889999
##   11 | 001111112222222233334444555556677788899999999
##   12 | 000111111112223334444
# First few influential rows with all of their variables.
head(academic[influential, ])  # influential observations.
## # A tibble: 6 x 48
##   COD_S11 GENDER EDU_FATHER EDU_MOTHER OCC_FATHER OCC_MOTHER STRATUM SISBEN
##   <chr>   <chr>  <chr>      <chr>      <chr>      <chr>      <chr>   <chr> 
## 1 SB1120… F      Incomplet… Complete … Independe… Home       Stratu… Level…
## 2 SB1120… M      Postgradu… Incomplet… 0          Independe… Stratu… It is…
## 3 SB1120… M      Complete … Complete … Independe… Auxiliary… Stratu… It is…
## 4 SB1120… M      Complete … Incomplet… Retired    Home       Stratu… It is…
## 5 SB1120… F      Complete … Complete … Independe… Small ent… Stratu… Level…
## 6 SB1120… M      Incomplet… Complete … Entrepren… Independe… Stratu… It is…
## # … with 40 more variables: PEOPLE_HOUSE <chr>, ...10 <lgl>, INTERNET <chr>,
## #   TV <chr>, COMPUTER <chr>, WASHING_MCH <chr>, MIC_OVEN <chr>, CAR <chr>,
## #   DVD <chr>, FRESH <chr>, PHONE <chr>, MOBILE <chr>, REVENUE <chr>,
## #   JOB <chr>, SCHOOL_NAME <chr>, SCHOOL_NAT <chr>, SCHOOL_TYPE <chr>,
## #   MAT_S11 <dbl>, CR_S11 <dbl>, CC_S11 <dbl>, BIO_S11 <dbl>, ENG_S11 <dbl>,
## #   Cod_SPro <chr>, UNIVERSITY <chr>, ACADEMIC_PROGRAM <chr>, QR_PRO <dbl>,
## #   CR_PRO <dbl>, CC_PRO <dbl>, ENG_PRO <dbl>, WC_PRO <dbl>, FEP_PRO <dbl>,
## #   G_SC <dbl>, PERCENTILE <dbl>, `2ND_DECILE` <dbl>, QUARTILE <dbl>,
## #   SEL <dbl>, SEL_IHE <dbl>, rv <dbl>, tvf <dbl>, variable <dbl>
# Bonferroni outlier test for model4: most extreme studentized residuals.
car::outlierTest(model4) 
##         rstudent unadjusted p-value Bonferroni p
## 1336  -10.746246         8.0996e-27   1.0052e-22
## 7721   -5.512041         3.6178e-08   4.4901e-04
## 10858  -5.408441         6.4755e-08   8.0368e-04
## 2021   -4.870049         1.1295e-06   1.4018e-02
## 173    -4.814685         1.4917e-06   1.8514e-02
#1336, 7721, 10858, 2021, 173 are the observations whose outcome values are unusual given their predictor values. 

car::leveragePlots(model4) 

# Linearity check: residuals vs fitted values.
plot(model4,1)

#The red line lies roughly horizontal near 0, which supports the linearity assumption. 

#Homoscedasticity 
plot(model4, 3)

#Scale-location plot: a roughly flat red line with points spread evenly across the range of fitted values indicates constant residual variance (homoscedasticity).
#Here the line is approximately horizontal with an even spread of observations, so the assumption appears reasonable.

#Create a density plot of the residuals to check approximate normality.
plot(density(resid(model4))) 

car::qqPlot(model4, main="QQ Plot Model 4") #qq plot for studentized resid

## [1] 1336 7721
#Collinearity: variance inflation factors.
vifmodel<-car::vif(model4)
vifmodel
##  academic$MAT_S11   academic$WC_PRO       academic$rv      academic$tvf 
##          7.563800          1.046026          1.010961         30.826055 
## academic$variable 
##         40.186636
# NOTE(review): the very large VIFs for tvf and the interaction term are the
# structural collinearity between a product term and its components; centring
# MAT_S11 before forming the product would likely reduce them — worth confirming.
#Tolerance (reciprocal of VIF)
1/vifmodel
##  academic$MAT_S11   academic$WC_PRO       academic$rv      academic$tvf 
##        0.13220868        0.95599914        0.98915809        0.03244009 
## academic$variable 
##        0.02488389

Reporting: Multiple linear regression was carried out to investigate the relationship between mathematics score, written communication score, revenue, TV, the interaction term (TV * Maths score), and global score. There was a significant relationship between mathematics score and global score (p < .001) and between written communication score and global score (p < .001). No statistically significant relationship was found for revenue (p = .27), TV (p = .055), or the interaction term (p = .76). A significant regression equation was found (F(5, 12405) = 3767, p < .001), with an R2 of 0.60.

# Refit without the components of the interaction term (MAT_S11 and tvf) to see
# whether the inflated VIFs in model4 were driven by the interaction's structural
# collinearity. (Stray "+" immediately after "~" in the original formula removed;
# it was a no-op and did not affect the fit.)
m1 <- lm(academic$G_SC ~ academic$WC_PRO + academic$rv + academic$variable)

# Variance inflation factors for the reduced model.
vifmodel <- car::vif(m1)
vifmodel
##   academic$WC_PRO       academic$rv academic$variable 
##          1.016527          1.007323          1.023649
#tolerance (reciprocal of VIF; values near 1 indicate little collinearity)
1/vifmodel
##   academic$WC_PRO       academic$rv academic$variable 
##         0.9837413         0.9927298         0.9768973

A large drop in the variance inflation factors was observed once the variables that contribute to the interaction term (MAT_S11 and tvf) were removed, confirming that the high VIFs in model4 were caused by the structural collinearity between the interaction term and its component variables rather than by collinearity among the other predictors.